Comprehensive Community Composition & Diversity Analysis of NEON MAGs with Soil Chemistry

Author

Tutorial

Published

December 3, 2025

Introduction

This comprehensive tutorial combines: - Taxonomic Profiling - Alpha Diversity (Shannon, Simpson, Richness) - Beta Diversity (Bray-Curtis) - Ordination (PCoA, NMDS) - Constrained Ordination (CCA, RDA) with soil chemistry - Color-coded biplots for environmental variables - Interactive plots using Plotly - Variance explained and significance tests

Each section includes explanations for interpretation.

Step 1: Load and Prepare Data

Explanation: Extract sample identifiers and soil chemistry variables.

R code
df <- read_csv("NEON_soilMAGs_soilChem.csv")

df <- df %>%
  filter(!is.na(genomicsSampleID)) %>%
  mutate(siteID = str_extract(genomicsSampleID, "^[A-Z]+"),
         subplot = str_extract(genomicsSampleID, "(?<=_)[0-9]+"),
         layer = str_extract(genomicsSampleID, "(?<=-)[OM]"),
         sampleID = paste(siteID, subplot, layer, sep = "_")) %>%
  select(sampleID, phylum, soilMoisture, soilTemp, soilInWaterpH, soilInCaClpH)

Step 2: Taxonomic Profiling

Explanation: Shows relative abundance of phyla per sample.

R code
tax_profile <- df %>%
  group_by(sampleID, phylum) %>%
  summarise(count = n(), .groups = "drop") %>%
  mutate(rel_abundance = count / sum(count))

ggplot(tax_profile, aes(x = sampleID, y = rel_abundance, fill = phylum)) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  labs(title = "Taxonomic Profile by Sample", y = "Relative Abundance") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1), legend.position = "right")

Step 3: Alpha Diversity

Explanation: Shannon = richness + evenness, Simpson = dominance, Richness = number of taxa.

R code
abundance_matrix <- tax_profile %>%
  pivot_wider(names_from = phylum, values_from = count, values_fill = 0)
abundance_matrix <- abundance_matrix %>% distinct(sampleID, .keep_all = TRUE)
rn <- abundance_matrix$sampleID
abundance_matrix <- abundance_matrix %>% select(-sampleID)
abundance_matrix <- as.data.frame(lapply(abundance_matrix, as.numeric))
rownames(abundance_matrix) <- rn
abundance_matrix <- abundance_matrix[rowSums(abundance_matrix) > 0, ]

shannon <- diversity(abundance_matrix, index = "shannon")
simpson <- diversity(abundance_matrix, index = "simpson")
richness <- specnumber(abundance_matrix)
alpha_div <- tibble(sampleID = rownames(abundance_matrix), Shannon = shannon, Simpson = simpson, Richness = richness)
alpha_div
# A tibble: 292 × 4
   sampleID   Shannon  Simpson Richness
   <chr>        <dbl>    <dbl>    <int>
 1 BONA_001_O 0.00173 0.000359        2
 2 BONA_002_O 0.694   0.500           3
 3 BONA_004_O 0.00173 0.000359        2
 4 BONA_006_O 0.694   0.500           3
 5 BONA_009_O 1.10    0.667           4
 6 BONA_013_O 0.00173 0.000359        2
 7 BONA_070_O 0.694   0.500           3
 8 BONA_071_O 0.00173 0.000359        2
 9 BONA_080_O 0.00173 0.000359        2
10 BONA_084_O 0.694   0.500           3
# ℹ 282 more rows

Step 4: Beta Diversity

Explanation: Bray-Curtis measures compositional dissimilarity.

R code
bray_dist <- vegdist(abundance_matrix, method = "bray")
as.matrix(bray_dist)[1:5, 1:5]
           BONA_001_O BONA_002_O BONA_004_O BONA_006_O BONA_009_O
BONA_001_O  0.0000000  0.5237892  0.2941176  0.5789315  0.6999838
BONA_002_O  0.5237892  0.0000000  0.3749650  0.5555157  0.3684121
BONA_004_O  0.2941176  0.3749650  0.0000000  0.4285421  0.5999713
BONA_006_O  0.5789315  0.5555157  0.4285421  0.0000000  0.2941065
BONA_009_O  0.6999838  0.3684121  0.5999713  0.2941065  0.0000000

Step 5: Ordination (PCoA and NMDS)

Explanation: Visualize sample relationships in reduced dimensions.

R code
pcoa_coords <- cmdscale(bray_dist, k = 2)
pcoa_df <- as_tibble(pcoa_coords) %>% mutate(sampleID = rownames(pcoa_coords))

ggplot(pcoa_df, aes(x = V1, y = V2, label = sampleID)) + geom_point() + geom_text(vjust = -0.5) + labs(title = "PCoA (Bray-Curtis)")

R code
nmds <- metaMDS(abundance_matrix, distance = "bray", k = 2)
Wisconsin double standardization
Run 0 stress 0.08532757 
Run 1 stress 0.1200854 
Run 2 stress 0.08703641 
Run 3 stress 0.09333725 
Run 4 stress 0.08902399 
Run 5 stress 0.1081445 
Run 6 stress 0.08492605 
... New best solution
... Procrustes: rmse 0.01883282  max resid 0.2615482 
Run 7 stress 0.08791272 
Run 8 stress 0.1139224 
Run 9 stress 0.0854949 
Run 10 stress 0.1158335 
Run 11 stress 0.119714 
Run 12 stress 0.09171769 
Run 13 stress 0.08640987 
Run 14 stress 0.08262548 
... New best solution
... Procrustes: rmse 0.008558615  max resid 0.06322369 
Run 15 stress 0.08490224 
Run 16 stress 0.08386775 
Run 17 stress 0.1077224 
Run 18 stress 0.08189793 
... New best solution
... Procrustes: rmse 0.008302257  max resid 0.0633948 
Run 19 stress 0.0937349 
Run 20 stress 0.08654519 
*** Best solution was not repeated -- monoMDS stopping criteria:
    10: no. of iterations >= maxit
     8: stress ratio > sratmax
     2: scale factor of the gradient < sfgrmin
R code
nmds_df <- as_tibble(nmds$points) %>% mutate(sampleID = rownames(nmds$points))

ggplot(nmds_df, aes(x = MDS1, y = MDS2, label = sampleID)) + geom_point() + geom_text(vjust = -0.5) + labs(title = "NMDS (Bray-Curtis)")

Step 6: Soil Chemistry Integration for CCA and RDA with Color-Coded Interactive Biplots

Explanation: Arrows show environmental influence; color-coded by variable type.

R code
soil_data <- df %>% select(sampleID, soilMoisture, soilTemp, soilInWaterpH, soilInCaClpH) %>% distinct(sampleID, .keep_all = TRUE)
soil_data <- soil_data %>% filter(sampleID %in% rownames(abundance_matrix))
soil_data <- column_to_rownames(soil_data, "sampleID")
soil_data <- as.data.frame(lapply(soil_data, as.numeric))
rownames(soil_data) <- rownames(abundance_matrix)

variable_types <- tibble(variable = c("soilMoisture", "soilTemp", "soilInWaterpH", "soilInCaClpH"),
                         type = c("Moisture", "Temperature", "pH", "pH"))

# CCA
cca_model <- cca(abundance_matrix ~ ., data = soil_data)
cca_var <- summary(cca_model)$concont$importance[2,]
cca_sites <- scores(cca_model, display = "sites")
cca_env <- scores(cca_model, display = "bp")
cca_df <- as.data.frame(cca_sites)
cca_df$sampleID <- rownames(cca_df)
cca_env_df <- as.data.frame(cca_env)
cca_env_df$variable <- rownames(cca_env_df)
cca_env_df <- left_join(cca_env_df, variable_types, by = "variable")

p_cca <- ggplot() +
  geom_point(data = cca_df, aes(x = CCA1, y = CCA2, text = sampleID), color = "blue") +
  geom_segment(data = cca_env_df, aes(x = 0, y = 0, xend = CCA1, yend = CCA2, color = type, text = variable),
               arrow = arrow(length = unit(0.3, "cm"))) +
  geom_text(data = cca_env_df, aes(x = CCA1, y = CCA2, label = variable), vjust = -0.5) +
  scale_color_manual(values = c("Moisture" = "blue", "Temperature" = "orange", "pH" = "green")) +
  labs(title = paste("CCA Interactive Biplot (Variance Explained: CCA1 =", round(cca_var[1]*100, 1), "%, CCA2 =", round(cca_var[2]*100, 1), "% )"),
       color = "Variable Type") + theme_minimal()

ggplotly(p_cca, tooltip = c("text"))
R code
# RDA
rda_model <- rda(abundance_matrix ~ ., data = soil_data)
rda_var <- summary(rda_model)$cont$importance[2,]
rda_sites <- scores(rda_model, display = "sites")
rda_env <- scores(rda_model, display = "bp")
rda_df <- as.data.frame(rda_sites)
rda_df$sampleID <- rownames(rda_df)
rda_env_df <- as.data.frame(rda_env)
rda_env_df$variable <- rownames(rda_env_df)
rda_env_df <- left_join(rda_env_df, variable_types, by = "variable")

p_rda <- ggplot() +
  geom_point(data = rda_df, aes(x = RDA1, y = RDA2, text = sampleID), color = "darkgreen") +
  geom_segment(data = rda_env_df, aes(x = 0, y = 0, xend = RDA1, yend = RDA2, color = type, text = variable),
               arrow = arrow(length = unit(0.3, "cm"))) +
  geom_text(data = rda_env_df, aes(x = RDA1, y = RDA2, label = variable), vjust = -0.5) +
  scale_color_manual(values = c("Moisture" = "blue", "Temperature" = "orange", "pH" = "green")) +
  labs(title = paste("RDA Interactive Biplot (Variance Explained: RDA1 =", round(rda_var[1]*100, 1), "%, RDA2 =", round(rda_var[2]*100, 1), "% )"),
       color = "Variable Type") + theme_minimal()

ggplotly(p_rda, tooltip = c("text"))

Interpretation

  • Arrow direction: Correlation with ordination axes.
  • Arrow length: Strength of influence.
  • Colors: Blue = Moisture, Orange = Temperature, Green = pH.
  • Hover for details in interactive plots.